---
title: mahoudata
keywords: fastai
sidebar: home_sidebar
summary: "API details."
---
The main idea of the data processing is to compute similarities among beers. To do so, we are going to compute similarities with regard to:
TODO:
#UDFs
class Preprocess:
    """Data-preparation helpers that run before a recommender strategy.

    The context passed to the constructor is a mapping; ``scale_cols`` reads
    its ``'numeric_cols'`` entry to decide which columns to scale.
    """

    def __init__(self, ctx):
        self.ctx = ctx

    def clean_duplicates(self):
        """Placeholder for duplicate removal; currently always returns 1."""
        # TODO: detect duplicates based on description and attributes,
        # then remove them.
        return 1

    def cols_munging(self, dataframe, fillna = True):
        """Return a tidied copy of ``dataframe`` with a ``beerID`` key.

        Steps, in order:
          * rename the "Temperatura Servicio" column to "temperatura";
          * add a 1-based sequential ``beerID`` column;
          * index the frame by the string form of ``beerID`` (the column
            itself is kept);
          * move ``beerID`` to the first column position;
          * when ``fillna`` is true, replace missing values with 0.

        The input frame is not modified, because ``rename`` returns a new
        object.
        """
        renamed = dataframe.rename(columns={"Temperatura Servicio": "temperatura"})

        # Sequential identifiers starting at 1, one per row.
        renamed['beerID'] = range(1, len(renamed) + 1)
        keyed = renamed.set_index(renamed['beerID'].astype(str))

        # Put beerID first while keeping every other column in its order.
        names = keyed.columns.tolist()
        id_pos = names.index('beerID')
        ordered = [names[id_pos]] + names[:id_pos] + names[id_pos + 1:]
        keyed = keyed.reindex(columns=ordered)

        # TODO: allow filling with the median/mean instead of 0.
        return keyed.fillna(0) if fillna else keyed

    def scale_cols(self, dataframe):
        """Min-max scale the configured numeric columns.

        Only the columns listed in ``self.ctx['numeric_cols']`` appear in the
        result; every other column is left out.

        NOTE(review): no ``index`` is passed when the result is built, so
        (assuming ``fit_transform`` returns a plain array) the output carries
        pandas' default index rather than the input's index -- confirm this
        is intended.
        """
        numeric = dataframe[self.ctx['numeric_cols']]
        scaled_values = MinMaxScaler().fit_transform(numeric)
        return pd.DataFrame(scaled_values, columns=numeric.columns)
class RecomenderStrategyFactory:
    """Build recommender strategy objects from a strategy name.

    The context given to the constructor is handed unchanged to whichever
    strategy is created.
    """

    def __init__(self, ctx):
        self.context = ctx

    def createStrategy(self, strategy):
        """Return the strategy instance matching ``strategy``.

        The name is compared case-insensitively. ``'numeric'`` yields a
        ``NumericStrategy``; any other name yields a
        ``DescriptionAndNumeric``.
        """
        if strategy.lower() == 'numeric':
            return NumericStrategy(self.context)
        return DescriptionAndNumeric(self.context)
class NumericStrategy:
    """Recommender strategy based only on numeric attributes.

    The context is a mapping whose ``'numeric_cols'`` entry lists the columns
    used to compute pairwise distances between rows.
    """

    def __init__(self, ctx):
        self.ctx = ctx

    def model_builder(self, dataframe):
        """Prepare ``dataframe`` for distance computation.

        Runs ``Preprocess.cols_munging`` (with missing values filled) followed
        by ``Preprocess.scale_cols`` and returns the scaled frame.
        """
        preprocessor = Preprocess(self.ctx)
        df = preprocessor.cols_munging(dataframe, fillna = True)
        return preprocessor.scale_cols(df)

    def exec_strategy(self, dataframe, distance = 'cosine'):
        """Return the square pairwise-distance matrix between rows.

        Parameters
        ----------
        dataframe : DataFrame containing the ``numeric_cols`` columns.
        distance : ``'euclidean'`` for Euclidean distance; any other value
            selects cosine distance (the default).

        Returns
        -------
        DataFrame whose rows AND columns are both labelled with
        ``dataframe.index``, so ``result.loc[a, b]`` is the distance between
        rows ``a`` and ``b``.
        """
        metric = 'euclidean' if distance == 'euclidean' else 'cosine'
        features = dataframe[self.ctx['numeric_cols']]
        distances = squareform(pdist(features, metric = metric))
        # Use the same labels on both axes. Previously the euclidean branch
        # stringified only the column labels, so rows and columns could not
        # be addressed with the same key.
        return pd.DataFrame(
            distances,
            columns = dataframe.index,
            index = dataframe.index
        )
# Load the raw dataset from the CSV file under ./data.
df = pd.read_csv("./data/dataset-datathon.csv")
# Build a profiling report over the raw data; the html option requests a
# full-width layout.
profile = ProfileReport(df, title='Pandas Profiling Report', html={'style':{'full_width':True}})
# Presumably renders the report inline in the notebook -- confirm against the
# profiling library's docs.
profile.to_notebook_iframe()
According to the profile report, 60% of the rows are duplicates, so we get rid of them.
# Drop duplicated rows. No subset is passed, so all columns are compared.
df_clean = df.drop_duplicates(
# Alternative kept for reference: compare every column except 'vajilla'.
#subset = df.columns.difference(['vajilla'])
)
# Profile the de-duplicated data; this rebinds `profile` to the new report.
profile = ProfileReport(df_clean, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile.to_notebook_iframe()
# Context shared by the preprocessing and strategy classes: the list of
# numeric columns that are scaled and used for the distance computation.
context = {'numeric_cols' : ['lupulo_afrutado_citrico',
'lupulo_floral_herbal','amargor', 'color',
'maltoso', 'licoroso', 'afrutado', 'especias','acidez']
}
# Create the numeric strategy through the factory.
f = RecomenderStrategyFactory(context)
strategy = f.createStrategy('numeric')
# Preprocess and scale the de-duplicated data, then compute the pairwise
# distance matrix (exec_strategy defaults to cosine distance).
datamodel = strategy.model_builder(df_clean)
recommender_df = strategy.exec_strategy(datamodel)
# Bare expression: displays the matrix as the notebook cell output.
recommender_df
# Distances from the beer in column 1 to every beer, smallest first.
# NOTE(review): label 1 is looked up in the index produced by scale_cols,
# which looks like a default 0-based index and not beerID -- confirm which
# beer is intended here.
recommendations_example = pd.DataFrame(recommender_df[1].sort_values(ascending=True))
recommendations_example
The cells below are work in progress.
# Reshape the square distance matrix to long form: one value per
# (column label, row label) pair.
long_form_cosine = recommender_df.unstack()
# Name the two index levels, then turn the Series into a DataFrame with the
# value column called 'cosine distance' and the index levels as regular columns.
long_form_cosine.index.rename(['Beer A', 'Beer B'], inplace=True)
long_form_cosine = long_form_cosine.to_frame('cosine distance').reset_index()
# Bare expression: displays the long-form table as the notebook cell output.
long_form_cosine
# Disabled work in progress: tokenize the description column.
#df['tokenized_desc'] = df['desc'].apply(word_tokenize)